{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "bert_text_classification.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/wshuyi/demo-text-binary-classification-with-bert/blob/master/bert_text_classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "metadata": {
        "id": "zwYoyLfLQI-t",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "base code borrowed from [this Google Colab Notebook](https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb).\n",
        "\n",
        "Refactored by [Shuyi Wang](https://www.linkedin.com/in/shuyi-wang-b3955026/) and [Yan Sun](https://github.com/SunYanCN)\n",
        "\n",
        "Please refer to [this Medium Article](https://medium.com/@wshuyi/how-to-do-text-binary-classification-with-bert-f1348a25d905) for detailed information.\n",
        "\n"
      ]
    },
    {
      "metadata": {
        "id": "EQHcJlAgHZ2o",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "!pip install bert-text"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "oZFDseXWHiSz",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from bert_text import run_on_dfs"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "R4pBwSidH4CT",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import pickle"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "wiOyfxH6H5ry",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "!wget https://github.com/wshuyi/info-5731-public/raw/master/imdb-sample.pickle"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "vEGtiE42IDyj",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "with open(\"imdb-sample.pickle\", 'rb') as f:\n",
        "  train, test = pickle.load(f)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "P6wlSR8NIKHS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "train = train.sample(len(train))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "qCoK42AGIXsS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "myparam = {\n",
        "    \"DATA_COLUMN\": \"text\",\n",
        "    \"LABEL_COLUMN\": \"sentiment\",\n",
        "    \"LEARNING_RATE\": 2e-5,\n",
        "    \"NUM_TRAIN_EPOCHS\": 3\n",
        "}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "SUwqGJuIfgap",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import tensorflow as tf\n",
        "tf.logging.set_verbosity(tf.logging.INFO)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "YoOVNBr7IsjS",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "result, estimator = run_on_dfs(train, test, **myparam)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "kaZMjQ0cIw9y",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "result"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}